; Some triangle Bezier patch procedures
;calc_tri:
if 0
; NOTE: everything down to the matching `end if` is excluded from assembly.
cart2barycentric:
; cartesian to barycentric coords
; Calls calc_tri three times - once for the whole triangle and twice for
; sub-triangles that use the point as one corner - and keeps the highest
; float of xmm7 after each call (described below as the triangle area).
; The two sub-areas are then multiplied by an approximate reciprocal of
; the whole area (rcpss), and the third coordinate is 1 minus their sum.
; in:
;   xmm0 = cartesian position
;   xmm1, xmm2, xmm3 = base vertices position (triangle corners)
; out:
;   xmm0 = s, t, u  coords
;   NOTE(review): as written the lanes come out lo->hi as a/p, b/p,
;   1-(a/p+b/p), i.e. u, t, s in the naming of the sketch below -
;   confirm which order callers expect.
; changes: eax, ebx, edx, edi and the xmm registers (nothing is saved here)
   .vt1   equ [ebp-24]
   .vt2   equ [ebp-36]       ; Way to convert cartesian coords to barycentric:
   .vt3   equ [ebp-48]       ;    .v2/|\          Point @ baryc. coords are
   .cp    equ [ebp-60]       ;      / |  \        u=a/p, t=b/p, s=c/p=1-u-t;
   .p     equ [ebp-84]       ;  d1 /b | a \ d2    where p is area of whole tri,
   .a     equ [ebp-88]       ;    /  /@\   \      a, b, c are areas of smaller
   .b     equ [ebp-92]       ;   / /  c  \  \     adjacent tris.
;   .a3    equ [ebp-96]      ;  /____________\
   .u      equ [ebp-100]     ;.v1     d3      .v3
   .stu    equ [ebp-108]
   push   ebp
   mov    ebp,esp
   sub    esp,110
   ; spill the inputs so calc_tri can address them relative to .cp;
   ; each 16-byte store overlaps the next slot by 4 bytes (slots are 12 apart)
   movups .cp,xmm0
   movups .vt3,xmm3
   movups .vt2,xmm2
   movups .vt1,xmm1
   ; edi = base address (.cp); eax, ebx, edx = byte offsets of the corners
   ; from that base.
   ; NOTE(review): ebx is derived from .vt2 (offset 24) and edx from .vt3
   ; (offset 12); the trailing "t3 ;12" / "vt2 ;24" remarks are swapped.
   lea    edi,.cp
   lea    eax,.vt1
   lea    edx,.vt3
   lea    ebx,.vt2
   sub    eax,edi ; vt1  ;36 ; vert 1
   sub    ebx,edi ; t3   ;12 ; vert 3
   sub    edx,edi ; vt2  ;24 ; vert 2
;calculating triangle parameters
; in:
;    edi = base address
;    eax, ebx, edx = vertices addresses, I assume
;    all are multiples of 12
; out:
;    xm0 - xm2 - vectors
;    xm3 - xm5 - normalized direction vects
;    xm6 - lengths of edges as follows lo->hi edx-eax, ab, db;
;          highest float = height of triangle vect perpendicular
;          to eax-edx edge
;    xm7 highest dword = area of triangle
; changes -> all xmmXX registers
;         -> none general
   call   calc_tri
   shufps xmm7,xmm7,11111111b  ; broadcast the highest lane
   movss  .p,xmm7              ; p = value for v1, v2, v3
   ; second call: same base, v1 and v2, third offset 0 = the point itself
   lea    edi,.cp
   lea    eax,.vt1
   lea    ebx,.vt2
   sub    eax,edi ; vt1  ;36 ; vert 1
   sub    ebx,edi ; t3   ;12 ; vert 3
   xor    edx,edx    ; cp
   call   calc_tri
   shufps xmm7,xmm7,11111111b
   movss  .b,xmm7              ; b = value for v1, v2, point
   ; third call: v2 and v3, with offset 0 (the point) passed in eax
   lea    edi,.cp
   lea    eax,.vt1
   lea    edx,.vt3
   lea    ebx,.vt2
;   sub    eax,edi ; vt1  ;36 ; vert 1
   sub    ebx,edi ; t3   ;12 ; vert 3
   sub    edx,edi ; vt2  ;24 ; vert 2
;   mov    ebx,12     ; vert 3
;   mov    edx,24     ; vert 2
   xor    eax,eax    ; cp
   call   calc_tri
   shufps xmm7,xmm7,11111111b
   movss  .a,xmm7              ; a = value for point, v2, v3
 ;  lea    edi,.cp
 ;  lea    eax,.vt1
 ;  lea    edx,.vt3
;   lea    ebx,.vt2
  ; sub    eax,edi ; vt1  ;36 ; vert 1
;   sub    ebx,edi ; t3   ;12 ; vert 3
  ; sub    edx,edi ; vt2  ;24 ; vert 2
;   mov    ebx,12     ; vert 3
;   mov    edx,36     ; vert 1
  ; xor    ebx,ebx    ; cp
  ; call   calc_tri
  ; shufps xmm7,xmm7,11111111b
;   movss  .a,xmm7
   rcpss  xmm6,.p    ; approximate 1/p
   shufps xmm6,xmm6,0
   movhps xmm7,.b    ; u=a/p, t=b/p, s=c/p=1-u-t;  loads .b and .a (8 bytes)
   shufps xmm7,xmm7,01001011b ; l->h = a, b
   mov    eax,1
   mulps  xmm7,xmm6  ; xm7 lo->hi =   u, t, ..
   movaps xmm6,xmm7
   haddps xmm6,xmm6  ; lane 0 = u + t
;   movhlps xmm6,xmm7
   cvtsi2ss xmm5,eax ; lane 0 = 1.0
;   addps  xmm6,xmm7
;   subps xmm5,xmm6
   subps xmm5,xmm6   ; xm5 = s=1-u-t
;   shufps xmm7,xmm7,11001011b
;   movups .stu,xmm7
;   movss  .stu,xmm5
;   movups xmm0,.stu
   movaps xmm0,xmm7
   movlhps xmm0,xmm5 ; xm0 lo->hi = u, t, s, (unused)
   mov    esp,ebp
   pop    ebp
ret
end if
;=======================
prepare_bez3_factors:
; Evaluate one coordinate of a cubic triangular Bezier patch: multiplies
; each of the ten node values by its Bernstein weight and sums them.
; in:
;  xmm0 : 1331 value (lo->hi 1, 3, 3, 1)
;  xmm1 : s,t,u values (lo->hi; the highest lane does not reach the result)
;  ebx  : triangle border values,
;          I mean base nodes values: ten floats of one coordinate.
;          Weights applied per node, in memory order:
;            [0] s^3     [1] 3s^2t   [2] 3st^2   [3] t^3
;            [4] 3t^2u   [5] 3tu^2   [6] u^3     [7] 3su^2
;            [8] 3s^2u   [9] 6stu
;          The last load is 16 bytes wide, so 8 bytes past the tenth float
;          are read (their products never reach lane 0).
; out:
;  xmm5 : final vertex value in the lowest lane
; changes: xmm0-xmm7; general registers are preserved
     push    ebp
     mov     ebp,esp
     sub     esp,80
     and     ebp,-16             ; align the frame so movaps on .stu is legal
     .u3     equ [ebp-8]
     .stu3   equ [ebp-16]
     .u2     equ [ebp-24]
     .stu2   equ [ebp-32]
     .v1331  equ [ebp-48]
     .u      equ [ebp-56]
     .stu    equ [ebp-64]
     ; prepare all powers of s, t, u:
     ; .stu = first powers, .stu2 = squares, .stu3 = cubes
     ; (.u/.u2/.u3 alias the third lane of each)
     movups  .v1331,xmm0
     movaps  .stu,xmm1
     movaps  xmm3,xmm1
     mulps   xmm3,xmm3
     movups  .stu2,xmm3
     mulps   xmm3,xmm1
     movups  .stu3,xmm3
     movlps  xmm3,.stu2
     movhps  xmm3,.stu3
     shufps  xmm3,xmm3,11110010b ; xm3 = lo-> hi:s^3  s^2  ..
     movlhps xmm0,xmm1           ; xm0 = lo-> hi:1    3    s  t
     shufps  xmm0,xmm0,11010010b ; xm0 = l -> h: s    1    3  t
     movlhps xmm3,xmm0           ; xm3 = l -> h: s^3  s^2  s  1
     ; weights of the s-t edge: s^3  s^2t  st^2  t^3
     movlps  xmm4,.stu2
     movhps  xmm4,.stu3
     shufps  xmm4,xmm4,11110111b ; xm4 = l -> h: t^3 t^2   ...
     shufps  xmm0,xmm0,00100111b ; xm0 = l -> h: t   1     3  s
     movlhps xmm4,xmm0           ; xm4 = l -> h: t^3 t^2   t  1
     movaps  xmm5,xmm4           ; keep the un-reversed t pack for later
     shufps  xmm4,xmm4,00011011b ; xm4 invert = l -> h: 1 t t^2 t^3
     ; t, s  packs ready..
     movups  xmm7,[ebx]
     mulps   xmm7,.v1331
     mulps   xmm7,xmm4
     mulps   xmm7,xmm3
     ; xm7 = first 4 of sub sum
     movlps  xmm4,.u
     movhps  xmm4,.u2
     shufps  xmm4,xmm4,11111000b ; xm4 = l -> h: u   u^2  ...
     ; fixed: the selector used to be 00000110b, which produced
     ; t, t^2, t^3, t^3 instead of the pack stated here
     shufps  xmm5,xmm5,11111001b ; xm5 = l -> h: t^2 t    1   1
     movhps  xmm4,.u3            ; xm4 = l -> h: u   u^2  u^3
     shufps  xmm4,xmm4,01100100b ; xm4 = l -> h: u   u^2  u^3 u^2
     shufps  xmm3,xmm3,10111111b ; xm3 = l -> h: 1   1    1   s
     movups  xmm6,[ebx+16]
     mulps   xmm6,xmm4 ; u
     mulps   xmm6,xmm5 ; t
     mulps   xmm6,xmm3 ; s
     movups  xmm0,.v1331
     ; fixed: the selector used to be 10100010b, which produced 3 1 3 3
     shufps  xmm0,xmm0,10001010b ; xm0 = l->h: 3 3 1 3
     mulps   xmm6,xmm0
     ; xm6 = sec 4 of sub sum:  3t^2u  3tu^2  u^3  3su^2
     ; now two last sub sum values...
     movups  xmm1,.stu2
     movups  xmm2,.stu
     movaps  xmm3,xmm2
     movaps  xmm4,xmm2
     shufps  xmm2,xmm2,11110010b ; u
     mulps   xmm1,xmm2           ; xm1 = s^2 * u
     shufps  xmm4,xmm4,01010101b ; xm4 = brd t
     mulps   xmm4,xmm3
     mulps   xmm4,xmm2           ; xm4 = s*t*u
     punpckldq xmm1,xmm4         ; xm1 = l -> h: s^2*u   s*t*u  ....
     addps   xmm0,xmm0           ; lowest lane 3 -> 6
     movhps  xmm0,.v1331
     shufps  xmm0,xmm0,11110011b ; xm0 = l -> h: 3  6 .....
     movups  xmm2,[ebx+32]
     mulps   xmm2,xmm0
     mulps   xmm2,xmm1
     ; xm2 = 3rd 2 of sub sum (only the two low lanes are meaningful)
     addps   xmm6,xmm7
     movhlps xmm5,xmm6           ; fold the high pair onto the low pair
     addps   xmm5,xmm6
     addps   xmm5,xmm2
     haddps  xmm5,xmm5           ; lane 0 = sum of all ten weighted nodes
     ; xm5 = final vertex value
     ; see * EQUATION * below
     add     esp,80
     pop     ebp
ret
;================================================================================
calc_bez3_patch:
        ;  Attempt for calculating triangle based Bezier patch
        ;  in:
        ;    eax = number of steps/quality, should be divisible by 2;
        ;          eax*eax vertices are written, 0 writes nothing
        ;    edi = place for calculated vertices (12 bytes each: x, y, z)
        ;    ebx = ten (10) base points, 12 bytes each,
        ;    ordered as follows:
        ;    p300, p201, p102, p003, p012, p021, p030
        ;    p120, p201
        ;    NOTE(review): the per-node weights documented in
        ;    prepare_bez3_factors follow the order p300, p210, p120, p030,
        ;    p021, p012, p003, p102, p201, p111 - the list above does not
        ;    match that; confirm against the callers.
        ;  changes: eax, ebx, ecx, esi, edi, xmm0-xmm7, [rand_seed]
        ;  I use equation:
        ;  **** EQUATION below ****
        ;  y = 1*p300*s^3+p201*s^2*u*3+p102*s*u^2*3+p003*u^3*1+
        ;  + p012*t*u^2*3 ... and so on, see
        ;  ascii art triangle...
        ;  *** EQUATION above *****
        ;    In prepare_bez3_factors proc I change order (see below).
        ;    Each output vertex uses barycentric params built from two
        ;    calls to `random` (the cartesian interpolation path is
        ;    commented out).
        ;    u = 1-s-t
        push         ebp            ;                   1
        mov          ebp,esp        ;                  t^3
        and          ebp,-16        ;                 p030
        sub          esp,256        ;            3     /\
        sub          ebp,128        ;          st^2   /  \     3
                                    ;          p120  /    \p021 t^2u
        .cur_u       equ [ebp-8]    ;               /      \
        .cur_t       equ [ebp-12]   ;        3     /   6    \
        .cur_s       equ [ebp-16]   ;       s^2t  /   stu    \     3
        ; current values of         ;       p210 /    p111    \ p012 tu^2
        ; barycentric params        ;           /              \
                                    ;          /                \
                                    ;       1 /__________________\
        .v1331       equ [ebp-32]   ;      p300   p201  p102    p003
        .cu          equ [ebp-40]   ;       s^3   s^2u  su^2    u^3
        .ct          equ [ebp-44]   ;               3    3      1
        .cs          equ [ebp-48]   ; Thanks to Alan Wolfe
        .st_count    equ [ebp-52]   ; for nice, easy to implement
                                    ; problem description
        .nodes_base  equ [ebp-56]
        .dest        equ [ebp-60]
        .vert1       equ [ebp-80]
        .vert2       equ [ebp-96]
        .vert3       equ [ebp-112]
        .x_base      equ [ebp]
        .y_base      equ [ebp+40]
        .z_base      equ [ebp+80]
         ; The frame is 256 bytes (it used to be 255, which left esp on an
         ; odd address for every push/call below).  Locals span
         ; [ebp-112, ebp+128), which still fits after the 16-byte alignment.
         mov         .dest,edi
         mov         .st_count,eax
         test        eax,eax
         jz          .bez_done      ; zero steps: the counters below would
                                    ; otherwise run until they wrap around
         cld
         movups      xmm7,[ebx]
         movups      xmm6,[ebx+36]
         movups      xmm5,[ebx+72]
         movaps      .vert3,xmm5    ; .vert1-.vert3 are not read again in
         movaps      .vert2,xmm6    ; this procedure
         movaps      .vert1,xmm7
         push        dword 1
         push        dword 3
         cvtpi2ps    xmm0,[esp]     ; low lanes = 3.0, 1.0
         shufps      xmm0,xmm0,01000001b ; xm0 = l -> h: 1 3 3 1
         add         esp,8
         movups      .v1331,xmm0
         ; I will try shrink src code to prevent fails and bugs..
         ; Through ex. aligned values ...
         ; I interpolate cartes. coords instead
         ; barycentr.
         xorps       xmm7,xmm7
         movaps      .cs,xmm7      ; curr counters (.cs, .ct, .cu) = 0
         mov         .nodes_base,ebx
         ; split the ten xyz nodes into three arrays of ten floats:
         ; .x_base, .y_base, .z_base (40 bytes each, contiguous)
         lea         edi,.x_base
         mov         ecx,3
      .tra:
         push        ecx
         mov         ecx,10
         mov         esi,ebx    ; init/reorder base nodes
       @@:
         movsd                  ; copy one coordinate ...
         add         esi,8      ; ... and skip to the same one in next node
         loop        @b
         add         ebx,4      ; next coordinate (x -> y -> z)
         pop         ecx
         loop        .tra
         mov         esi,bezier3_tris
         mov         [rand_seed],0xf0f0f0dd
     .ll_s:
         xor         eax,eax
         mov         .ct,eax
     .ll_t:
         push        esi
      ;   movups      xmm0,.currv     ; curr vertex value
      ;   movups      xmm1,.vert1
      ;   movups      xmm2,.vert2
      ;   movups      xmm3,.vert3
      ;   call        cart2barycentric
      ;   movaps      .cur_s,xmm0
;  in  - ecx - min
;        edx - max
;  out - eax - random number
         pushad
         xor      ecx,ecx
         mov      edx,100
         call     random
         push     eax          ; calc random
         xor      ecx,ecx      ; barycentric coords
         mov      edx,100
         call     random
         push     eax
         cvtpi2ps xmm1,[esp]   ; two random integers -> two floats
         add      esp,8
         popad
         mov      eax,1
         cvtsi2ss xmm3,eax     ; lane 0 = 1.0
         mov      ebx,255
         push     ebx ebx
         cvtpi2ps xmm4,[esp]
         rcpps    xmm4,xmm4    ; approximate 1/255
         mulps    xmm1,xmm4    ; scale both random values
         add      esp,8
         movaps   xmm0,xmm1
         haddps   xmm1,xmm1    ; lane 0 = sum of the two values
         subps    xmm3,xmm1    ; lane 0 = 1 - sum = third coordinate
         movlhps  xmm0,xmm3
         movaps   .cur_s,xmm0  ; l -> h: s, t, u, (unused)
      ;   movups       xmm1,[esi]
      ;   movups      .cur_s,xmm1
         ; evaluate the patch once per coordinate and store x, y, z
         movups      xmm0,.v1331
         movups      xmm1,.cur_s
         lea         ebx,.x_base  ; these are reorganized base nodes
         call        prepare_bez3_factors
         mov         edi,.dest
         movss       [edi],xmm5
         movups      xmm0,.v1331
         movups      xmm1,.cur_s
         lea         ebx,.y_base
         call        prepare_bez3_factors
         mov         edi,.dest
         movss       [edi+4],xmm5
         movups      xmm0,.v1331
         movups      xmm1,.cur_s
         lea         ebx,.z_base
         call        prepare_bez3_factors
         mov         edi,.dest
         movss       [edi+8],xmm5
         add         .dest,dword 12
         pop         esi
         add         esi,12
         inc         dword .ct
         mov         eax,.st_count
         cmp         .ct,eax
         jnz         .ll_t
    ;     movups      xmm0,.vb
    ;     movups      xmm1,.ve
    ;     addps       xmm0,.d2  ; edge v2 -- v3
    ;     addps       xmm1,.d3  ; edge v1 -- v3
    ;     movups      .vb,xmm0
    ;     movups      .currv,xmm0
    ;     movups      .ve,xmm1
         inc         dword .cs
         mov         eax,.st_count
         cmp         .cs,eax
         jnz         .ll_s
         ; Now, I assume I calculate vertices of single Bezier triangle patch...
         ; Time to mesh it..
         ; first times vertex only model, to test and check the way to mesh....
     .bez_done:
         add         esp,256
         pop         ebp
         ret
mesh:
; Build triangles from a point cloud: pair up vertices into edges by
; nearest distance, then give every edge the vertex nearest to its first
; endpoint as the third corner.
; in:
;  ecx = points/verts number
;  esi = ptr to verts (12 bytes each)
;  edi = ptr to tris (three dword vertex indices are written per tri)
; out:
;  ecx = number of tris
; changes: eax, ebx, esi, edi, xmm0-xmm2, xmm7
; Relies on the malloc/mfree macros defined elsewhere (result in eax, ebx
; kept across malloc); allocation failure is not checked.
; Vertices are fetched with 16-byte loads, so 4 bytes past the last vertex
; are read.
          push           ebp
          mov            ebp,esp
          .verts_ptr     equ [ebp-4]
          .verts_cnt     equ [ebp-8]
          .lenght        equ [ebp-12]
          .curr_v_index  equ [ebp-16]
          .curr_v_index2 equ [ebp-20]
          .in_busy       equ [ebp-24]   ; busy indicator
          .tris_ptr      equ [ebp-28]   ; curr value
          .tri_cnt       equ [ebp-32]
          .edg_cnt       equ [ebp-36]
          .edges         equ [ebp-40]
          .edges2        equ [ebp-44]
          .edg3          equ [ebp-60]
          push     esi                 ; lands in .verts_ptr
          push     ecx                 ; lands in .verts_cnt
          sub      esp,60
          mov      .tris_ptr,edi
          xor      eax,eax
          mov      .edg_cnt,eax
          ; busy bitmap: one bit per vertex, whole dwords,
          ; (cnt/32 + 1) * 4 bytes
          mov      ebx,ecx
          shr      ebx,5
          inc      ebx
          shl      ebx,2
          malloc   ebx
          mov      .in_busy,eax
          mov      ecx,.verts_cnt
          mov      edi,eax
          xor      eax,eax
          shr      ecx,5               ; was `shr ecx,32`, a shift by 0 that
          inc      ecx                 ; zeroed cnt+1 dwords past the buffer
          cld
          rep      stosd           ; zero busy indicator
          ; two index buffers, same size as before: ((cnt >> 8) + 10) * 4096
          mov      ebx,.verts_cnt
          shr      ebx,8
          add      ebx,10
          shl      ebx,12
          malloc   ebx
          mov      .edges,eax
          malloc   ebx
          mov      .edges2,eax
     if 1
          ;  I will use algo, as follows
          ;  1. I search for shortest possibly edges made from distinct vertices
          ;  2. For every edge, again I search for one possible nearest vertex
          ;  3. Now I get some possible non clenched triangles
          ;  4. Now I can search for closed vertex located on the second side
          ; of edge....
          mov      edi,.edges
        .search2:
          or       ecx,-1
        .next_free:
          inc      ecx
          cmp      ecx,.verts_cnt
          je       .end_edg         ; all are busy
          mov      ebx,.in_busy     ; search for 1st not busy
          bt       [ebx],ecx        ; bit index = vertex index
          jc       .next_free
          mov      esi,ecx
          imul     esi,12
          add      esi,.verts_ptr
          movups   xmm0,[esi]       ; load first not busy
          ; esi was initialised to first value
          mov      .lenght,dword 100000.0
          mov      [edi],ecx
          or       eax,-1
          mov      .curr_v_index,eax
          mov      .curr_v_index2,eax
     ;     push     ecx esi
     ;     mov      esi,.verts_ptr
     ;     or       ecx,-1
        .search:
          ; scan the vertices after the start one for the nearest free one;
          ; the bound is checked before the load, so index verts_cnt is
          ; never examined
          inc      ecx
          cmp      ecx,.verts_cnt
          jae      .search_done
          add      esi,12
          movups   xmm1,[esi]
          subps    xmm1,xmm0
          dpps     xmm1,xmm1,01110111b  ; squared xyz distance
          sqrtps   xmm1,xmm1
          comiss   xmm1,.lenght
          ja       .search              ; farther than the best so far
          mov      ebx,.in_busy
          bt       [ebx],ecx
          jc       .search              ; already used by another edge
          movss    .lenght,xmm1
          mov      .curr_v_index,ecx
          jmp      .search
        .search_done:
      ;    pop      esi ecx
          mov      ebx,.curr_v_index
          cmp      ebx,-1
          ; check if whole edge is found
          je      .end_edg            ; cannot find at last tri base verst
                                      ; end edge part of proc
          mov      [edi+4],ebx        ; append edge to list
          inc      dword .edg_cnt
          mov      eax,.in_busy       ; mark both ends in busy buff
          bts      dword [eax],ebx
          mov      ebx,[edi]
          bts      dword [eax],ebx
          add      edi,8
          jmp      .search2
        .end_edg:
      ;    mov      ecx,.verts_cnt
      ;    mov      edi,eax
      ;    xor      eax,eax
      ;    shr      ecx,32
      ;    inc      ecx
      ;    cld
      ;    rep      stosd           ; zero busy indicator
          mov     esi,.edges
          mov     edi,.edges2
          mov     ecx,.edg_cnt
          test    ecx,ecx
          jz      .end                ; no edges: the countdown below would
                                      ; wrap around instead of stopping
          cld
        .nx_ed2:
          movlps  xmm0,[esi]          ; both indices of this edge
          lodsd
          xchg    eax,ebx
          movlhps xmm0,xmm0           ; xm0 = i0, i1, i0, i1
          imul    ebx,12
          lodsd
          add     ebx,.verts_ptr
          movups  xmm1,[ebx]          ; position of the first endpoint
          mov     .lenght,dword 10000.0
          ; NOTE(review): .curr_v_index is not reset per edge, so when no
          ; vertex is closer than 10000.0 the previous value is stored.
          push    ecx
          or      ecx,-1
        .nx_v:
          inc     ecx
          cmp     ecx,.verts_cnt
          je      .ed2
          movd    xmm2,ecx
          shufps  xmm2,xmm2,0
          pcmpeqd xmm2,xmm0
          movmskps eax,xmm2
          or       eax,eax
          jnz     .nx_v               ; skip the edge's own endpoints
          mov     eax,ecx
          imul    eax,12
          add     eax,.verts_ptr
          movups  xmm7,[eax]
          subps   xmm7,xmm1
          dpps    xmm7,xmm7,01110111b
          sqrtps  xmm7,xmm7
          comiss  xmm7,.lenght
          jnb     .nx_v
          movss   .lenght,xmm7
          mov     .curr_v_index,ecx
          jmp     .nx_v
        .ed2:
          pop     ecx
          mov     eax,.curr_v_index
          stosd                       ; third corner for this edge
  ;        movd    eax,xmm0
  ;        stosd
  ;        loop
          dec   ecx
          jnz   .nx_ed2
        .tris:
      end if
          ; emit one triangle per edge: its two indices plus the third corner
          mov     edi,.tris_ptr
          mov     esi,.edges
          mov     ebx,.edges2
          mov     ecx,.edg_cnt
          test    ecx,ecx
          jz      .end
        .emit_tri:
          movsd
          movsd
          mov     eax,[ebx]
          stosd
          add     ebx,4
          dec     ecx
          jnz     .emit_tri
        .end:
          mfree    .in_busy
          mfree    .edges
          mfree    .edges2       ; was never released
          mov      ecx,.edg_cnt  ;.tri_cnt
          mov   esp,ebp
          pop   ebp
ret
;
if 0
; NOTE: everything down to the matching `end if` is excluded from assembly.
; The procedure is unfinished: it has no epilogue or ret, and several lines
; flagged below would not assemble or would not terminate if it were enabled.
march_cube:
   ; Finds the bounding box of the vertices, derives per-axis steps
   ; (box size * approximate 1/100) and starts testing whether a vertex
   ; lies within a threshold of the box's minimum corner.
   ; in esi = ptr to vertices
   ;    edi = ptr to tris
   ;    ecx = verts number
     .verts_ptr equ [ebp-4]
     .tris_ptr  equ [ebp-8]
     .verts_cnt equ [ebp-12]
     .zz        equ [ebp-20]
     .zd        equ [ebp-24]
     .yd        equ [ebp-28]
     .xd        equ [ebp-32]
     .zzz       equ [ebp-36]
;     .treshz    equ [ebp-40]
;     .treshy    equ [ebp-44]
     .treshx    equ [ebp-48]     ;  16 bytes inbetween
;     .treshy    equ [ebp-64]     ;  16 bytes inbetween
;     .treshz    equ [ebp-80]     ;  16 bytes inbetween
     .t3        equ [ebp-88]
     .t2        equ [ebp-92]
     .t1        equ [ebp-96]
     push       ebp
     ; first min max all verts
     mov        ebp,esp
     sub        esp,96
     ; NOTE(review): ebp is rounded down after esp was lowered by exactly 96,
     ; so locals near [ebp-96] can lie up to 15 bytes below esp.
     and        ebp,-16
     mov        .verts_cnt,ecx
     mov        .verts_ptr,esi
     mov        .tris_ptr,edi
     xorps      xmm0,xmm0
     xorps      xmm1,xmm1
     movaps     .treshx,xmm0
     ; NOTE(review): .treshy and .treshz have no active equ above.
     movaps     .treshy,xmm0
     movaps     .treshz,xmm0
   @@:
     ; min and max both start from zero, so the box always contains the origin
     movups     xmm2,[esi]
     maxps      xmm1,xmm2     ; find bound box
     minps      xmm0,xmm2
     add        esi,12
     loop       @b
     mov        eax,100
     ; NOTE(review): mov/sub do not accept xmm operands; movaps/subps were
     ; presumably intended.
     mov        xmm7,xmm1
     sub        xmm7,xmm0
     cvtsi2ss   xmm6,eax
     shufps     xmm6,xmm6,0
     rcpps      xmm6,xmm6  ; approximate 1/100 in every lane
     mulps      xmm6,xmm7  ; xmm6 = deltas
     movups     .xd,xmm6   ; 16-byte store: fills .xd, .yd, .zd and .zz
     mulps      xmm6,[f05x3]
     movaps     .treshx,xmm6
     xor        edx,edx
     mov        .zz,edx
     mov        .zzz,edx
     movaps     xmm3,.xd
     movaps     xmm4,xmm3
     movaps     xmm5,xmm3
     shufps     xmm3,xmm3,11111100b   ; xm3 = delta x
     shufps     xmm4,xmm4,11110111b   ; xm4 = delta y
     shufps     xmm5,xmm5,11101111b   ; xm5 = delta z
;     movaps     xmm7,[f05x3]
;     movaps     xmm6,xmm7
;     mulps      xmm6,xmm3
;     movaps     .treshx,xmm6
;     movaps     xmm6,xmm7
;     mulps      xmm6,xmm4
;     movaps     .treshy,xmm6
;     mulps      xmm7,xmm5
;     movaps     .treshz,xmm6
; I tried to determine if at least one vert is in cube corner
     mov        ecx,eax
     push       ecx        ; NOTE(review): no matching pop in this block
     movaps     xmm2,xmm0  ; beginning (minimum corner of the box)
     mov        ecx,.verts_cnt
     mov        esi,.verts_ptr
     xor        ebx,ebx
     xor        ecx,ecx    ; discards the count loaded two lines above
  .ll:
     movups     xmm7,[esi]
     movaps     xmm6,xmm7
     subps      xmm7,.treshx
     addps      xmm6,.treshx
     cmpltps    xmm6,xmm2
     cmpltps    xmm7,xmm2
     xorps      xmm6,xmm7  ; lane set when exactly one of the compares holds
     movmskps   eax,xmm6
     and        eax,111b   ; only x, y, z lanes
     cmp        eax,111b
     jne        @f
     bts        ebx,0       ; 1st cor = busy
    @@:
     add        esi,12
     ; NOTE(review): ecx is never incremented inside .ll, so this exit
     ; condition only holds when .verts_cnt is 0.
     cmp        ecx,.verts_cnt
     jne        .ll
end if